clear all 
set more off 


* FIGURE D.4, PANEL B

*---------------------------------------------------------------------*
* TSIV RESULTS --- 1910-1930 (1940 CENSUS DATA COMES FROM NBER SERVER)
*---------------------------------------------------------------------*

* Bring in data 
	use "$Mydirectory1/3_Output/TSIV_2Samples_1940Census.dta", clear 

	forval i=1(1)3 {
		gen est_`i'=. 
		gen est_lb_`i' =.
		gen est_ub_`i' =.
	}
	
*-------------------------------------*
* Regressions  *
*-------------------------------------*

* Loop through each decade 	
	forval i=1920(10)1930 {
		
		noisily display "Year `i'"

		preserve
		
	* Restrict sample and set up triplets
		keep if survey1936==1 | (census==1 & problem_occs!=1) | (surveys==1 & decade==`i')
		replace census=1 if survey1936==1
	
		local var_list "fatheroccej race south_merge" 
		egen triplet = group(`var_list')
		
		bysort census triplet: gen tag = _n==1 //tag first observation of triplet
		bysort triplet: egen number_samples = sum(tag) 
		keep if number_samples==2 //keep if triplet is in both census and surveys
		
	* Remake the dummies for triplets that exist in both datasets
		drop triplet* 
		egen triplet = group(`var_list')
		quietly tab triplet, gen(triplet_)
		local max = `r(r)'
		local max_minus1 = `r(r)'-1 //grabs # of dummies minus 1
		drop triplet_`max'
		
	* Save the dataset to be used for all regressions 
		tempfile restricted_data_`i'
		save `restricted_data_`i''
		
	*------------------------------*
  *------------------------------* 
		
	* 1. Baseline regression

		noisily display "Baseline method"
		
		local tempie "log_father_HHinc_1936fix"
		reg log_son_baseline `tempie'  if surveys==1 [aw=wgt_sex_race], robust
		
		restore 
		
		* Bring back full dataset and save estimates 
		quietly replace est_1 = _b[`tempie'] if decade==`i'
		quietly replace est_ub_1 = _b[`tempie']+1.96*_se[`tempie'] if decade==`i'
		quietly replace est_lb_1 = _b[`tempie']-1.96*_se[`tempie'] if decade==`i'
		
		
    * 2. TSIV without any corrections --- naive approach

		noisily display "Two-steps: log then avg, naive"

		preserve
		
		use `restricted_data_`i'', clear
		
		//Naive version without standard error correction 
		quietly reg log_father_income_1936 triplet_* if census==1
		predict yhat_father 
		
		reg log_son_baseline yhat_father if surveys==1 [aw=wgt_sex_race], robust
		
		restore 
		
		quietly replace est_2 = _b[yhat_father] if decade==`i'
		quietly replace est_ub_2 = _b[yhat_father]+1.96*_se[yhat_father] if decade==`i'
		quietly replace est_lb_2 = _b[yhat_father]-1.96*_se[yhat_father] if decade==`i'
		
    /* 3. TSIV correcting standard errors based on two sample uncertainty
          Source: (https://ars.els-cdn.com/content/image/1-s2.0-S0165176516302373-mmc1.pdf) */

		noisily display "Two-steps: log then avg, full"

		preserve
		
		use `restricted_data_`i'', clear

    * First stage  		
		keep if census==1
		gen const=1 
		gen x1 = log_father_income_1936
		
		//robust 
		quietly reg x1 triplet_*, robust
		mat Vx2het = e(V)
		
		//non-robust
		quietly sureg (x1 = triplet_*)
		mat Vx2hom = e(V)
			
	* Second stage 
		use `restricted_data_`i'', clear
		keep if surveys==1
		gen y = log_son_baseline
		
		predict x1h, equation(x1) //generate predicted x using Census results from sureg
		
		scalar kx = 1 //number of predicted variables
		scalar ke = 1 //number of exogenous variables (constant)
		
		quietly reg y triplet_* [aw=wgt_sex_race]
		mat Vy1hom = e(V)*e(df_r)/_N
		
		quietly reg y triplet_* [aw=wgt_sex_race], robust  
		mat Vy1het = e(V)*e(df_r)/_N
		
	/* TS2SLS estimator */
		reg y x1h [aw=wgt_sex_race]
		scalar coeff1 = _b[x1h]
		display coeff1
		mat b2s = e(b)
		mat b2sx = b2s[1,1..kx]
		
		/* Constructing C hat */
		quietly reg triplet_1 x1h [aw=wgt_sex_race]
		mat ch = e(b)'
		forval b=2(1)`max_minus1' {
			quietly reg triplet_`b' x1h [aw=wgt_sex_race]
			mat ch = ch,e(b)'	
		}
		mat ch = ch,(J(kx,ke,0)\I(ke))
		
		* Calculating non-robust standard errors 
		mat var1hom = ch*Vy1hom*ch' + (b2sx' # ch)*Vx2hom*(b2sx # ch')
		mat seb2shom = vecdiag(cholesky(diag(vecdiag(var1hom))))'
		
		* Calculating robust standard errors 
		mat var1het = ch*Vy1het*ch' + (b2sx' # ch)*Vx2het*(b2sx # ch')
		mat seb2shet = vecdiag(cholesky(diag(vecdiag(var1het))))'
		
		mat list seb2shom
		mat list seb2shet
		
		scalar se1=seb2shet[1,1]
		display se1
		
		scalar ub1 = coeff1 + (1.96*se1)
		scalar lb1 = coeff1 - (1.96*se1)
		
		restore
		
	* Bring back full dataset and save estimates 
		quietly replace est_3 = coeff1 if decade==`i'
		quietly replace est_ub_3 = ub1 if decade==`i'
		quietly replace est_lb_3 = lb1 if decade==`i'
		
	}
	
	bysort decade: keep if _n==1
	keep decade est_*
	
	reshape long est_ est_lb_ est_ub_, i(decade) j(estimate)
	replace decade= decade+1 if estimate==2
	replace decade= decade+2 if estimate==3
	replace decade= decade+3 if estimate==4
	drop if decade==.
	
	save "$Mydirectory2/appendix_d/TSIV_results_1940_1936fix.dta", replace 
